%load_ext autoreload
%autoreload 2
import networkx as nx
import numpy as np
import geopandas as gpd
import pandas as pd
import scanpy as sc
import voyagerpy as vp
from anndata import AnnData
from matplotlib import pyplot as plt
from matplotlib.pyplot import rc_context
import pathlib
import json
We download the dataset from the 10X website. These are two gzipped tar archives containing the unfiltered gene count matrix and the spatial information
import requests
outs_dir = pathlib.Path('data/visium_10x/outs')
outs_dir.mkdir(parents=True, exist_ok=True)
root_dir = (outs_dir / '..').resolve()
# Download the gene count matrix
tar_path_ob = root_dir / 'visium_ob.tar.gz'
url_reads = "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/Visium_Mouse_Olfactory_Bulb/Visium_Mouse_Olfactory_Bulb_raw_feature_bc_matrix.tar.gz"
if not tar_path_ob.exists():
res = requests.get(url_reads)
with tar_path_ob.open('wb') as f:
f.write(res.content)
# Download the spatial information
tar_path_sp = root_dir / 'visium_ob_spatial.tar.gz'
url_spatial = "https://cf.10xgenomics.com/samples/spatial-exp/2.0.0/Visium_Mouse_Olfactory_Bulb/Visium_Mouse_Olfactory_Bulb_spatial.tar.gz"
if not tar_path_sp.exists():
res = requests.get(url_spatial)
with tar_path_sp.open('wb') as f:
f.write(res.content)
# Decompress the downloaded files
!tar -xvf $tar_path_ob -C $outs_dir
!tar -xvf $tar_path_sp -C $outs_dir
x raw_feature_bc_matrix/ x raw_feature_bc_matrix/features.tsv.gz x raw_feature_bc_matrix/barcodes.tsv.gz x raw_feature_bc_matrix/matrix.mtx.gz x spatial/ x spatial/spatial_enrichment.csv x spatial/tissue_lowres_image.png x spatial/tissue_positions.csv x spatial/detected_tissue_image.jpg x spatial/aligned_fiducials.jpg x spatial/scalefactors_json.json x spatial/tissue_hires_image.png
This is what the layout of the files look like. The outputs in the spatial directory is explained here on the 10X website.
def print_dir_content(p, d=0):
print(' '*(d*2) + p.name, end='/\n' if p.is_dir() else '\n')
if p.is_dir():
for sub in sorted(p.iterdir()):
print_dir_content(sub, d+1)
has_tree = !command -v tree
if has_tree:
!tree $outs_dir
else:
print_dir_content(outs_dir)
data/visium_10x/outs ├── raw_feature_bc_matrix │ ├── barcodes.tsv.gz │ ├── features.tsv.gz │ └── matrix.mtx.gz └── spatial ├── aligned_fiducials.jpg ├── detected_tissue_image.jpg ├── scalefactors_json.json ├── spatial_enrichment.csv ├── tissue_hires_image.png ├── tissue_lowres_image.png ├── tissue_positions.csv └── tissue_positions_list.csv 3 directories, 11 files
The tissue_hires_image.png file is a relatively high resolution image of the tissue, but not
full resolution. The tissue_lowres_image.png file is a low resolution image of the tissue,
suitable for quick plotting.
im = plt.imread(outs_dir/'spatial' / 'tissue_hires_image.png')
fig, ax = plt.subplots()
ax.imshow(im, origin='lower')
plt.show()
json.load((outs_dir / 'spatial' / 'scalefactors_json.json').open())
{'tissue_hires_scalef': 0.2,
'tissue_lowres_scalef': 0.06,
'fiducial_diameter_fullres': 118.91545,
'spot_diameter_fullres': 73.61433}
pd.read_csv(outs_dir / 'spatial' / 'tissue_positions.csv').head()
| barcode | in_tissue | array_row | array_col | pxl_row_in_fullres | pxl_col_in_fullres | |
|---|---|---|---|---|---|---|
| 0 | ACGCCTGACACGCGCT-1 | 0 | 0 | 0 | 8668 | 1102 |
| 1 | TACCGATCCAACACTT-1 | 0 | 1 | 1 | 8611 | 1200 |
| 2 | ATTAAAGCGGACGAGC-1 | 0 | 0 | 2 | 8554 | 1102 |
| 3 | GATAAGGGACGATTAG-1 | 0 | 1 | 3 | 8498 | 1200 |
| 4 | GTGCAAATCACCAATA-1 | 0 | 0 | 4 | 8441 | 1102 |
pd.read_csv(outs_dir / 'spatial' / 'spatial_enrichment.csv').head()
| Feature ID | Feature Name | Feature Type | I | P value | Adjusted p value | Feature Counts in Spots Under Tissue | Median Normalized Average Counts | Barcodes Detected per Feature | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ENSMUSG00000001023 | S100a5 | Gene Expression | 0.770905 | 0.0 | 0.0 | 9019 | 15.848669 | 1021 |
| 1 | ENSMUSG00000019874 | Fabp7 | Gene Expression | 0.698735 | 0.0 | 0.0 | 13462 | 20.679932 | 1170 |
| 2 | ENSMUSG00000002985 | Apoe | Gene Expression | 0.694521 | 0.0 | 0.0 | 67509 | 76.635169 | 1184 |
| 3 | ENSMUSG00000025739 | Gng13 | Gene Expression | 0.658575 | 0.0 | 0.0 | 5260 | 8.803694 | 1050 |
| 4 | ENSMUSG00000090223 | Pcp4 | Gene Expression | 0.631703 | 0.0 | 0.0 | 45118 | 25.811125 | 1133 |
Here we read the Space Ranger output as an AnnData object
adata = vp.read_10x_visium(
outs_dir,
datatype = 'mtx',
raw = True,
prefix = None,
symbol_as_index=False,
dtype=np.float64
)
_ = vp.plt.imshow(adata)
fig, axs = plt.subplots(2, 2, figsize=(10,10))
vp.plt.imshow(adata, ax=axs[0,0], title='Original')
vp.spatial.rotate_img90(adata, k=-1, apply=False)
vp.plt.imshow(adata, tmp=True, ax=axs[0,1], title='Rotated 90° clockwise - not applied')
vp.spatial.mirror_img(adata, axis=1, apply=False)
vp.plt.imshow(adata, tmp=True, ax=axs[1, 0], title='Mirror rows - not applied')
vp.spatial.apply_transforms(adata)
_ = vp.plt.imshow(adata, ax=axs[1,1], title='Transformed image - changes applied')
if adata.uns['config']['var_names'] == 'symbol':
is_mt = adata.var.index.to_series().str.contains('^mt-').values
else:
is_mt = adata.var['symbol'].str.contains('^mt-').values
adata = vp.utl.calculate_metrics(adata)
vp.utils.add_per_cell_qcmetrics(adata, subsets={'mito': is_mt})
adata = vp.spatial.get_geom(adata)
_ = vp.plt.plot_spatial_feature(
adata,
['sum', 'detected', 'subsets_mito_percent'],
tissue=False,
ncol=2
)
adata.obs['in_tissue'] = adata.obs['in_tissue'].astype('category')
axs = vp.plt.plot_barcode_data(
adata,
y=['sum', 'detected', 'subsets_mito_percent'],
x='in_tissue',
ncol=3,
figsize=(8, 4),
cmap='tab10'
)
_ = vp.plt.plot_barcode_data(
adata,
x='sum',
y='subsets_mito_percent',
color_by='in_tissue',
cmap='tab10'
)
adata_tissue = adata[adata.obs["in_tissue"]==1]
adata_tissue = vp.spatial.get_geom(adata_tissue)
_ = vp.plotting.plot_barcodes_bin2d(
adata_tissue,
x='sum',
y='detected',
bins=76,
cmin=1
)
In order to preserve the counts and the log-normalized counts, we save them as layers. This is because we
will normalize adata_tissue.X before we perform PCA in this notebook.
adata_tissue.layers['counts'] = adata_tissue.X.copy()
vp.utils.log_norm_counts(adata_tissue, inplace=True)
adata_tissue.layers['logcounts'] = adata_tissue.X.copy()
In order to compare this notebook to the corresponding R vignette, we use the top highly variable genes acquired from scran's modelGeneVar and getTopHVGs.
use_R_generated_hvgs = True
hvgs_file = (root_dir / 'hvgs.txt')
if use_R_generated_hvgs and hvgs_file.exists():
with hvgs_file.open() as f:
hvgs = list(map(str.strip, f.readlines()))
adata_tissue.var['highly_variable'] = False
if adata_tissue.uns['config']['var_names'] == 'gene_ids':
adata_tissue.var.loc[hvgs, 'highly_variable'] = True
else:
adata_tissue.var['highly_variable'] = adata_tissue.var['gene_ids'].isin(hvgs)
else:
sc.pp.highly_variable_genes(adata_tissue, flavour='cell_ranger', n_top_genes=2000)
In the companion vignette, the data is scaled prior to computing the PCA. Thus, we follow suit.
# scale first, then pca
use_sc = False
if use_sc:
# Use scanpy scaling
sc.pp.scale(adata_tissue, zero_center=True)
else:
# Use voyagerpy scaling
adata_tissue.X = vp.utils.scale(adata_tissue.X, center=True)
sc.tl.pca(adata_tissue, use_highly_variable=True, n_comps=30, random_state=1337)
_ = vp.plt.elbow_plot(adata_tissue, ndims=30)
ax = vp.plt.plot_dim_loadings(adata_tissue, range(5), ncol=2, show_symbol=True)
Cluster the reduced dimension of the barcodes.
sc.pp.neighbors(adata_tissue, n_pcs=3, use_rep='X_pca')
sc.tl.leiden(
adata_tissue,
random_state=29,
resolution=0.3,
key_added='cluster',
)
# sc.tl.umap(adata_tissue)
ax = vp.plt.plot_pca(adata_tissue, figsize=(7,7), ndim=5, colorby='cluster', cmap='tab10')
_ = vp.plt.plot_spatial_feature(adata_tissue, 'cluster', cmap='dittoseq', barcode_geom='spot_poly')
pl = vp.plt.spatial_reduced_dim(
adata_tissue,
"X_pca",
ncomponents=5,
barcode_geom="spot_poly", ncol=2, divergent=True, div_cmap='roma')
genes_use = ['Gm42418', 'Syt1', 'Pcp4', 'Ly6g6e', 'mt-Nd1','Aqp1', 'Ecrg4']
_ = vp.plt.plot_expression(
adata_tissue,
genes_use,
groupby='cluster',
show_symbol=True,
layer='logcounts'
)
_ = vp.plt.plot_spatial_feature(
adata_tissue,
genes_use,
barcode_geom = "spot_poly",
ncol = 2,
layer='logcounts',
)